import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import seaborn as sn
# Lift the column cap so wide frames are displayed without eliding columns.
pd.set_option('display.max_columns', None)

# Read the stats/salaries CSV into a DataFrame, then preview the first
# five rows (5 is the default for head()).
df = pd.read_csv(filepath_or_buffer='../data/clean/stats_salaries.csv')
df.head(n=5)
| | PLAYER_ID | PLAYER_NAME | START_POSITION | MIN | FGM | FGA | FG_PCT | FG3M | FG3A | FG3_PCT | FTM | FTA | FT_PCT | OREB | DREB | REB | AST | STL | BLK | TO | PF | PTS | PLUS_MINUS | TEAM_ABBREVIATION | TEAM_NAME | GAME_DATE_EST | SEASON_START | WON_GAME | SALARY | INFLATION_ADJ_SALARY | TEAM_PAYROLL | INFLATION_ADJ_TEAM_PAYROLL | LEAGUE_PAYROLL | INFLATION_ADJ_LEAGUE_PAYROLL | TEAM_IMPORTANCE | LEAGUE_IMPORTANCE | TEAM_MARKET_SIZE |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 1630162 | Anthony Edwards | F | 36:22 | 4.0 | 10.0 | 0.400 | 3.0 | 8.0 | 0.375 | 4.0 | 4.0 | 1.000 | 0.0 | 8.0 | 8.0 | 5.0 | 3.0 | 1.0 | 1.0 | 1.0 | 15.0 | 5.0 | MIN | Minnesota Timberwolves | 2022-03-12 | 2021 | 1 | 10245480.0 | 10245480 | 137098327 | 137098327 | 4125163242 | 4125163242 | 0.074731 | 0.002484 | 0.033235 |
| 1 | 1630162 | Anthony Edwards | F | 34:27 | 9.0 | 19.0 | 0.474 | 4.0 | 11.0 | 0.364 | 3.0 | 3.0 | 1.000 | 0.0 | 3.0 | 3.0 | 5.0 | 1.0 | 0.0 | 0.0 | 3.0 | 25.0 | -3.0 | MIN | Minnesota Timberwolves | 2022-03-11 | 2021 | 0 | 10245480.0 | 10245480 | 137098327 | 137098327 | 4125163242 | 4125163242 | 0.074731 | 0.002484 | 0.033235 |
| 2 | 1630162 | Anthony Edwards | F | 25:29 | 7.0 | 15.0 | 0.467 | 2.0 | 8.0 | 0.250 | 0.0 | 0.0 | 0.000 | 0.0 | 1.0 | 1.0 | 3.0 | 2.0 | 1.0 | 4.0 | 2.0 | 16.0 | 7.0 | MIN | Minnesota Timberwolves | 2022-03-09 | 2021 | 1 | 10245480.0 | 10245480 | 137098327 | 137098327 | 4125163242 | 4125163242 | 0.074731 | 0.002484 | 0.033235 |
| 3 | 1630162 | Anthony Edwards | F | 32:22 | 7.0 | 13.0 | 0.538 | 1.0 | 5.0 | 0.200 | 2.0 | 2.0 | 1.000 | 2.0 | 2.0 | 4.0 | 4.0 | 4.0 | 0.0 | 4.0 | 5.0 | 17.0 | 7.0 | MIN | Minnesota Timberwolves | 2022-02-28 | 2021 | 1 | 10245480.0 | 10245480 | 137098327 | 137098327 | 4125163242 | 4125163242 | 0.074731 | 0.002484 | 0.033235 |
| 4 | 1630162 | Anthony Edwards | F | 37:46 | 5.0 | 13.0 | 0.385 | 1.0 | 6.0 | 0.167 | 4.0 | 6.0 | 0.667 | 1.0 | 2.0 | 3.0 | 5.0 | 1.0 | 1.0 | 2.0 | 3.0 | 15.0 | -21.0 | MIN | Minnesota Timberwolves | 2022-02-25 | 2021 | 0 | 10245480.0 | 10245480 | 137098327 | 137098327 | 4125163242 | 4125163242 | 0.074731 | 0.002484 | 0.033235 |
# Display the column labels of the loaded frame.
df.columns
Index(['PLAYER_ID', 'PLAYER_NAME', 'START_POSITION', 'MIN', 'FGM', 'FGA',
'FG_PCT', 'FG3M', 'FG3A', 'FG3_PCT', 'FTM', 'FTA', 'FT_PCT', 'OREB',
'DREB', 'REB', 'AST', 'STL', 'BLK', 'TO', 'PF', 'PTS', 'PLUS_MINUS',
'TEAM_ABBREVIATION', 'TEAM_NAME', 'GAME_DATE_EST', 'SEASON_START',
'WON_GAME', 'SALARY', 'INFLATION_ADJ_SALARY', 'TEAM_PAYROLL',
'INFLATION_ADJ_TEAM_PAYROLL', 'LEAGUE_PAYROLL',
'INFLATION_ADJ_LEAGUE_PAYROLL', 'TEAM_IMPORTANCE', 'LEAGUE_IMPORTANCE',
'TEAM_MARKET_SIZE'],
dtype='object')
# Keep only the rows whose MIN value is not the string '0:00'
# (presumably appearances with no playing time -- confirm against the data).
df = df[df['MIN'].ne('0:00')]
We want to create a matrix of scatter plots to visualize the pairwise correlations between the numerical variables. To do so, we first average each player's per-game numbers within each season.
# Collapse the per-game rows into one row per (SEASON_START, PLAYER_ID)
# pair by averaging every numeric column; the two keys become the index.
#
# numeric_only=True is needed because df still carries text columns
# (PLAYER_NAME, START_POSITION, MIN, TEAM_ABBREVIATION, TEAM_NAME,
# GAME_DATE_EST). pandas >= 2.0 raises TypeError when asked to average
# them, whereas older releases silently dropped them. Passing the flag
# explicitly gives the same numeric-only result on every version.
avg_df = df.groupby(['SEASON_START', 'PLAYER_ID']).mean(numeric_only=True)
avg_df.head()
| SEASON_START | PLAYER_ID | FGM | FGA | FG_PCT | FG3M | FG3A | FG3_PCT | FTM | FTA | FT_PCT | OREB | DREB | REB | AST | STL | BLK | TO | PF | PTS | PLUS_MINUS | WON_GAME | SALARY | INFLATION_ADJ_SALARY | TEAM_PAYROLL | INFLATION_ADJ_TEAM_PAYROLL | LEAGUE_PAYROLL | INFLATION_ADJ_LEAGUE_PAYROLL | TEAM_IMPORTANCE | LEAGUE_IMPORTANCE | TEAM_MARKET_SIZE |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 2003 | 15 | 1.440000 | 3.840000 | 0.343820 | 0.860000 | 2.440000 | 0.309060 | 0.280000 | 0.320000 | 0.143340 | 0.200000 | 1.280000 | 1.480000 | 0.520000 | 0.320000 | 0.100000 | 0.580000 | 0.860000 | 4.020000 | -0.300000 | 0.440000 | 2500000.0 | 3697550.0 | 55401430.0 | 81939823.0 | 1.672617e+09 | 2.473835e+09 | 0.045125 | 0.001495 | 0.033123 |
| | 43 | 1.133333 | 2.800000 | 0.311267 | 0.533333 | 1.133333 | 0.283333 | 0.333333 | 0.333333 | 0.200000 | 0.200000 | 0.666667 | 0.866667 | 0.933333 | 0.400000 | 0.133333 | 0.400000 | 0.400000 | 3.133333 | -3.466667 | 0.200000 | 1070000.0 | 1582551.0 | 45682902.0 | 67565925.0 | 1.672617e+09 | 2.473835e+09 | 0.023422 | 0.000640 | 0.027312 |
| | 56 | 5.288462 | 11.634615 | 0.450538 | 0.663462 | 2.125000 | 0.250154 | 1.932692 | 2.692308 | 0.528808 | 0.875000 | 3.105769 | 3.980769 | 5.432692 | 1.144231 | 0.221154 | 1.721154 | 2.201923 | 13.173077 | 3.076923 | 0.663462 | 4917000.0 | 7272342.0 | 64123085.0 | 94839328.0 | 1.672617e+09 | 2.473835e+09 | 0.076681 | 0.002940 | 0.038337 |
| | 57 | 3.968085 | 8.819149 | 0.438585 | 0.659574 | 1.861702 | 0.246202 | 2.010638 | 2.340426 | 0.608915 | 0.914894 | 3.372340 | 4.287234 | 4.191489 | 1.840426 | 0.489362 | 1.882979 | 2.351064 | 10.606383 | 3.882979 | 0.659574 | 6900000.0 | 10205238.0 | 68372826.0 | 101124779.0 | 1.672617e+09 | 2.473835e+09 | 0.100917 | 0.004125 | 0.040878 |
| | 72 | 2.395833 | 5.479167 | 0.394979 | 0.020833 | 0.083333 | 0.020833 | 0.729167 | 1.000000 | 0.288187 | 0.395833 | 1.312500 | 1.708333 | 2.625000 | 0.562500 | 0.104167 | 1.145833 | 1.583333 | 5.541667 | 1.312500 | 0.750000 | 1070000.0 | 1582551.0 | 58156236.0 | 86014236.0 | 1.672617e+09 | 2.473835e+09 | 0.018399 | 0.000640 | 0.034770 |
# Render seaborn's pairwise plot grid over the columns of avg_df.
sn.pairplot(data=avg_df)
<seaborn.axisgrid.PairGrid at 0x2070213a8b0>